import pandas as pd
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from sklearn.decomposition import PCA
import plotly.offline
plotly.offline.init_notebook_mode()
from IPython.display import IFrame
linkdf = pd.read_csv("links.csv")
moviedf = pd.read_csv("movies_metadata.csv")
ratingdf = pd.read_csv("ratings.csv")
dftmdb = pd.DataFrame({"Id": ratingdf.groupby("movieId").rating.count().keys(),
                       "nrating": ratingdf.groupby("movieId").rating.count(),
                       "avgrating": ratingdf.groupby("movieId").rating.mean()})
# let us pick the movies with more than 1000 ratings
pickmovieId = dftmdb[dftmdb.nrating>1000].Id.unique()
sns.displot(dftmdb[dftmdb.nrating>1000], x="avgrating")
print("number of movies we choose (criterion: more than 1000 ratings):", len(dftmdb[dftmdb.nrating>1000]) )
moviedf["date"] = pd.to_datetime(moviedf.release_date, format = "%Y-%m-%d",\
errors="coerce")
moviedf["year"] = moviedf.date.dt.year
moviedf["month"] = moviedf.date.dt.month
linkdfpick = linkdf[linkdf.movieId.isin(pickmovieId)]
# drop picked movies whose IMDb id has no matching title in the metadata
a = list(pickmovieId)
for i, j in linkdfpick.iterrows():
    if len(moviedf.title[moviedf.imdb_id == "tt{:07d}".format(int(j.imdbId))]) < 1:
        n = a.index(int(j.movieId))
        del a[n]
pickmovieId = np.array(a)
linkdfpick = linkdf[linkdf.movieId.isin(pickmovieId)].copy()
linkdfpick["title"] = linkdfpick.imdbId.apply(
    lambda x: moviedf.title[moviedf.imdb_id == "tt{:07d}".format(x)].values[0])
linkdfpick
ratingdfpick = ratingdf[ratingdf.movieId.isin(pickmovieId)]
user_review_num = ratingdfpick.groupby("userId").movieId.count().to_dict()
# keep only users with more than 10 ratings among the picked movies
pickuserId = [i for i in user_review_num if user_review_num[i] > 10]
ratingdfpick = ratingdfpick[ratingdfpick.userId.isin(pickuserId)].copy()
# reindex user ids to a contiguous range, keeping the mapping in both directions
userold2new = {j: i for i, j in enumerate(ratingdfpick.userId.unique())}
usernew2old = {i: j for i, j in enumerate(ratingdfpick.userId.unique())}
ratingdfpick.userId = ratingdfpick.userId.apply(lambda x: userold2new[x])
ratingdfpick.head()
# reindex movie ids the same way
movieold2new = {j: i for i, j in enumerate(ratingdfpick.movieId.unique())}
movienew2old = {i: j for i, j in enumerate(ratingdfpick.movieId.unique())}
ratingdfpick.movieId = ratingdfpick.movieId.apply(lambda x: movieold2new[x])
ratingdfpick.head()
ratingdfpick["rating_norm"] = (ratingdfpick.rating - ratingdfpick.rating.mean())/ ratingdfpick.rating.var()
#ratingdfpick.to_csv("ratings_reduced.csv")
df_train, df_test = train_test_split(ratingdfpick, stratify = ratingdfpick.userId, test_size = 0.2)
print(df_train.info())
print("# of users:", len(df_train.userId.unique()))
print("# of movies:", len(df_train.movieId.unique()))
We assume that each movie and each user can be represented by a vector such that the predicted rating $\hat{r}_{mu}$, user $u$'s rating of movie $m$, is the inner product of $V_u$ and $W_m$.
The vectors are trained by minimizing the squared error $$ \sum_{mu} (\hat{r}_{mu} - r_{mu})^2. $$
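Differentiating this loss gives the updates used in the SGD routines below: $$ \frac{\partial L}{\partial W_m} = 2 \sum_{u} (\hat{r}_{mu} - r_{mu})\, V_u, \qquad \frac{\partial L}{\partial V_u} = 2 \sum_{m} (\hat{r}_{mu} - r_{mu})\, W_m, $$ where the sums run over the observed ratings only. In the code, the factor of 2 is absorbed into the learning rate, and each step is clipped to a fixed range to keep the updates stable.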
def SGD_without_bias(dftrain, dftest, vec_movies=None, vec_users=None, learning_rate=0.002, Nepoch=10, Ndf=3):
    Nusers = len(dftrain.userId.unique())
    Nmovies = len(dftrain.movieId.unique())
    if vec_users is None:
        vec_users = np.zeros((Nusers, Ndf))
    if vec_movies is None:
        vec_movies = np.random.randn(Ndf, Nmovies)
    else:
        vec_movies = vec_movies.T
    train_loss = []
    test_loss = []
    # pre-group the ratings by movie so the epoch loop only indexes lists
    userIds_list_train = [dftrain[dftrain.movieId == mg].userId for mg in range(Nmovies)]
    scores_list_train = [dftrain[dftrain.movieId == mg].rating_norm for mg in range(Nmovies)]
    userIds_list_test = [dftest[dftest.movieId == mg].userId for mg in range(Nmovies)]
    scores_list_test = [dftest[dftest.movieId == mg].rating_norm for mg in range(Nmovies)]
    for epoch in range(Nepoch):
        moviepermu = np.random.permutation(Nmovies)
        trainloss = 0
        for mg in moviepermu:
            userIds = userIds_list_train[mg]
            scores = scores_list_train[mg]
            # residuals of the current predictions for all users who rated movie mg
            res = ((vec_users[userIds, :] @ vec_movies[:, mg:mg+1]).ravel()
                   - scores.values)
            # clipped gradient steps for the movie vector and the user vectors
            step = (vec_users[userIds, :] * res.reshape(-1, 1)).sum(axis=0)
            vec_movies[:, mg] -= np.clip(step * learning_rate, -0.5, 0.5)
            vec_users[userIds, :] -= np.clip((vec_movies[:, mg:mg+1] * res).T * learning_rate, -0.7, 0.7)
            trainloss += (res**2).sum()
        train_loss.append(trainloss / len(dftrain))
        # evaluate on the test set without updating the parameters
        loss = 0
        for mg in moviepermu:
            userIds = userIds_list_test[mg]
            scores = scores_list_test[mg]
            res = ((vec_users[userIds, :] @ vec_movies[:, mg:mg+1]).ravel()
                   - scores.values)
            loss += (res**2).sum()
        print("\r train loss:", trainloss / len(dftrain), "test loss:", loss / len(dftest), end='')
        test_loss.append(loss / len(dftest))
    return vec_movies.T, vec_users, train_loss, test_loss
def SGD_with_onlybias(dftrain, dftest, bias_movies=None, bias_users=None, learning_rate=0.001, Nepoch=10, Ndf=3):
    # trains only the per-movie and per-user biases, with no latent vectors
    Nusers = len(dftrain.userId.unique())
    Nmovies = len(dftrain.movieId.unique())
    if bias_users is None:
        bias_users = np.zeros(Nusers)
    if bias_movies is None:
        bias_movies = np.zeros(Nmovies)
    train_loss = []
    test_loss = []
    userIds_list_train = [dftrain[dftrain.movieId == mg].userId for mg in range(Nmovies)]
    scores_list_train = [dftrain[dftrain.movieId == mg].rating_norm for mg in range(Nmovies)]
    userIds_list_test = [dftest[dftest.movieId == mg].userId for mg in range(Nmovies)]
    scores_list_test = [dftest[dftest.movieId == mg].rating_norm for mg in range(Nmovies)]
    for epoch in range(Nepoch):
        moviepermu = np.random.permutation(Nmovies)
        trainloss = 0
        for mg in moviepermu:
            userIds = userIds_list_train[mg]
            scores = scores_list_train[mg]
            # residual of the bias-only prediction
            res = (bias_movies[mg]
                   + bias_users[userIds]
                   - scores.values)
            bias_movies[mg] -= np.clip(learning_rate * res.sum(), -0.5, 0.5)
            bias_users[userIds] -= np.clip(learning_rate * res, -0.7, 0.7)
            trainloss += (res**2).sum()
        train_loss.append(trainloss / len(dftrain))
        loss = 0
        for mg in moviepermu:
            userIds = userIds_list_test[mg]
            scores = scores_list_test[mg]
            res = (bias_movies[mg]
                   + bias_users[userIds]
                   - scores.values)
            loss += (res**2).sum()
        print("\r train loss:", trainloss / len(dftrain), "test loss:", loss / len(dftest), end='')
        test_loss.append(loss / len(dftest))
    return bias_movies, bias_users, train_loss, test_loss
def SGD_with_fixedbias(dftrain, dftest, bias_movies, bias_users, vec_movies=None, vec_users=None, learning_rate=0.002, Nepoch=10, Ndf=3):
    # trains the latent vectors while the given biases are held fixed
    Nusers = len(dftrain.userId.unique())
    Nmovies = len(dftrain.movieId.unique())
    if vec_users is None:
        vec_users = np.zeros((Nusers, Ndf))
    if vec_movies is None:
        vec_movies = np.random.randn(Ndf, Nmovies)
    else:
        vec_movies = vec_movies.T
    train_loss = []
    test_loss = []
    userIds_list_train = [dftrain[dftrain.movieId == mg].userId for mg in range(Nmovies)]
    scores_list_train = [dftrain[dftrain.movieId == mg].rating_norm for mg in range(Nmovies)]
    userIds_list_test = [dftest[dftest.movieId == mg].userId for mg in range(Nmovies)]
    scores_list_test = [dftest[dftest.movieId == mg].rating_norm for mg in range(Nmovies)]
    for epoch in range(Nepoch):
        moviepermu = np.random.permutation(Nmovies)
        trainloss = 0
        for mg in moviepermu:
            userIds = userIds_list_train[mg]
            scores = scores_list_train[mg]
            res = ((vec_users[userIds, :] @ vec_movies[:, mg:mg+1]).ravel()
                   + bias_movies[mg]
                   + bias_users[userIds]
                   - scores.values)
            step = (vec_users[userIds, :] * res.reshape(-1, 1)).sum(axis=0)
            vec_movies[:, mg] -= np.clip(step * learning_rate, -0.5, 0.5)
            vec_users[userIds, :] -= np.clip((vec_movies[:, mg:mg+1] * res).T * learning_rate, -0.7, 0.7)
            trainloss += (res**2).sum()
        train_loss.append(trainloss / len(dftrain))
        loss = 0
        for mg in moviepermu:
            userIds = userIds_list_test[mg]
            scores = scores_list_test[mg]
            res = ((vec_users[userIds, :] @ vec_movies[:, mg:mg+1]).ravel()
                   + bias_movies[mg]
                   + bias_users[userIds]
                   - scores.values)
            loss += (res**2).sum()
        print("\r train loss:", trainloss / len(dftrain), "test loss:", loss / len(dftest), end='')
        test_loss.append(loss / len(dftest))
    return vec_movies.T, vec_users, bias_movies, bias_users, train_loss, test_loss
def SGD_with_bias(dftrain, dftest, bias_movies=None, bias_users=None, vec_movies=None, vec_users=None, learning_rate=0.002, Nepoch=10, Ndf=3):
    Nusers = len(dftrain.userId.unique())
    Nmovies = len(dftrain.movieId.unique())
    if bias_users is None:
        bias_users = np.zeros(Nusers)
    if bias_movies is None:
        bias_movies = np.zeros(Nmovies)
    if vec_users is None:
        vec_users = np.zeros((Nusers, Ndf))
    if vec_movies is None:
        vec_movies = np.random.randn(Ndf, Nmovies)
    else:
        vec_movies = vec_movies.T
    train_loss = []
    test_loss = []
    userIds_list_train = [dftrain[dftrain.movieId == mg].userId for mg in range(Nmovies)]
    scores_list_train = [dftrain[dftrain.movieId == mg].rating_norm for mg in range(Nmovies)]
    userIds_list_test = [dftest[dftest.movieId == mg].userId for mg in range(Nmovies)]
    scores_list_test = [dftest[dftest.movieId == mg].rating_norm for mg in range(Nmovies)]
    for epoch in range(Nepoch):
        moviepermu = np.random.permutation(Nmovies)
        trainloss = 0
        for mg in moviepermu:
            userIds = userIds_list_train[mg]
            scores = scores_list_train[mg]
            res = ((vec_users[userIds, :] @ vec_movies[:, mg:mg+1]).ravel()
                   + bias_movies[mg]
                   + bias_users[userIds]
                   - scores.values)
            # update the biases and the vectors together, with clipped steps
            bias_movies[mg] -= np.clip(learning_rate * res.sum(), -0.2, 0.2)
            bias_users[userIds] -= np.clip(learning_rate * res, -0.3, 0.3)
            step = (vec_users[userIds, :] * res.reshape(-1, 1)).sum(axis=0)
            vec_movies[:, mg] -= np.clip(step * learning_rate, -0.3, 0.3)
            vec_users[userIds, :] -= np.clip((vec_movies[:, mg:mg+1] * res).T * learning_rate, -0.4, 0.4)
            trainloss += (res**2).sum()
        train_loss.append(trainloss / len(dftrain))
        loss = 0
        for mg in moviepermu:
            userIds = userIds_list_test[mg]
            scores = scores_list_test[mg]
            res = ((vec_users[userIds, :] @ vec_movies[:, mg:mg+1]).ravel()
                   + bias_movies[mg]
                   + bias_users[userIds]
                   - scores.values)
            loss += (res**2).sum()
        print("\r train loss:", trainloss / len(dftrain), "test loss:", loss / len(dftest), end='')
        test_loss.append(loss / len(dftest))
    return vec_movies.T, vec_users, bias_movies, bias_users, train_loss, test_loss
def ALS_with_bias(dftrain, dftest, bias_movies, bias_users, vec_movies=None, vec_users=None, learning_rate=0.002, Nepoch=10, Ndf=3):
    # despite the name, this routine accumulates the gradients over all movies
    # and applies them once per epoch, i.e. full-batch gradient descent
    Nusers = len(dftrain.userId.unique())
    Nmovies = len(dftrain.movieId.unique())
    if vec_users is None:
        vec_users = np.zeros((Nusers, Ndf))
    if vec_movies is None:
        vec_movies = np.random.randn(Ndf, Nmovies)
    else:
        vec_movies = vec_movies.T
    train_loss = []
    test_loss = []
    userIds_list_train = [dftrain[dftrain.movieId == mg].userId for mg in range(Nmovies)]
    scores_list_train = [dftrain[dftrain.movieId == mg].rating_norm for mg in range(Nmovies)]
    userIds_list_test = [dftest[dftest.movieId == mg].userId for mg in range(Nmovies)]
    scores_list_test = [dftest[dftest.movieId == mg].rating_norm for mg in range(Nmovies)]
    for epoch in range(Nepoch):
        grad_vec_movies = np.zeros_like(vec_movies)
        grad_vec_users = np.zeros_like(vec_users)
        grad_bias_movies = np.zeros_like(bias_movies)
        grad_bias_users = np.zeros_like(bias_users)
        moviepermu = np.random.permutation(Nmovies)
        trainloss = 0
        for mg in moviepermu:
            userIds = userIds_list_train[mg]
            scores = scores_list_train[mg]
            res = ((vec_users[userIds, :] @ vec_movies[:, mg:mg+1]).ravel()
                   + bias_movies[mg]
                   + bias_users[userIds]
                   - scores.values)
            step = (vec_users[userIds, :] * res.reshape(-1, 1)).sum(axis=0)
            grad_bias_movies[mg] += learning_rate * res.sum()
            grad_bias_users[userIds] += learning_rate * res
            grad_vec_movies[:, mg] += step * learning_rate
            grad_vec_users[userIds, :] += (vec_movies[:, mg:mg+1] * res).T * learning_rate
            trainloss += (res**2).sum()
        # apply the accumulated updates once per epoch
        bias_movies -= grad_bias_movies
        bias_users -= grad_bias_users
        vec_movies -= grad_vec_movies
        vec_users -= grad_vec_users
        train_loss.append(trainloss / len(dftrain))
        loss = 0
        for mg in moviepermu:
            userIds = userIds_list_test[mg]
            scores = scores_list_test[mg]
            res = ((vec_users[userIds, :] @ vec_movies[:, mg:mg+1]).ravel()
                   + bias_movies[mg]
                   + bias_users[userIds]
                   - scores.values)
            loss += (res**2).sum()
        print("\r train loss:", trainloss / len(dftrain), "test loss:", loss / len(dftest), end='')
        test_loss.append(loss / len(dftest))
    return vec_movies.T, vec_users, bias_movies, bias_users, train_loss, test_loss
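SGD_with_onlybias, SGD_with_fixedbias, and ALS_with_bias are not exercised below; a hedged usage sketch (the hyperparameters here are illustrative, not tuned):
# sketch: fit the biases alone first, then refine everything with the
# full-batch routine, reusing the learned biases as the starting point
b_m, b_u, _, _ = SGD_with_onlybias(df_train, df_test, Nepoch=30)
out_als = ALS_with_bias(df_train, df_test, b_m, b_u, Nepoch=50, learning_rate=0.001)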
outnew3 = SGD_without_bias(df_train,df_test, Nepoch = 120, learning_rate = 0.001, Ndf=4)
pca = PCA()
pca.fit(outnew3[0].copy())
print(pca.explained_variance_)
movie_vec_pca = outnew3[0] @ pca.components_.T
user_vec_pca = outnew3[1] @ pca.components_.T
# the sign of a PCA component is arbitrary; flip the first one so the
# plots below have a consistent orientation
movie_vec_pca[:,0] *= -1
user_vec_pca[:,0] *= -1
print("user vectors")
print("variance:", user_vec_pca.var(axis=0), "mean:", user_vec_pca.mean(axis=0))
print("movie vectors")
print("variance:", movie_vec_pca.var(axis=0),"mean:", movie_vec_pca.mean(axis=0))
movie_factor = pd.DataFrame(movie_vec_pca, columns = ["comp_1", "comp_2", "comp_3", "comp_4"])
movie_factor["movieId"] = movie_factor.index.map(lambda x: movienew2old[x])
movie_factor["title"] = movie_factor["movieId"].apply(lambda x: linkdfpick.title[linkdfpick.movieId == x].values[0])
fig = px.scatter_3d(movie_factor, x="comp_1", y="comp_2", z="comp_3", hover_name="title",color = "comp_1")
fig.update_layout(margin=dict(l=30, r=30, t=30, b=30))
fig.write_html("NoBias3D.html")
IFrame(src='./NoBias3D.html', width=800, height=700)
fig = px.scatter(movie_factor, x="comp_2", y="comp_3", hover_name="title",color = "comp_1")
fig.update_layout(margin=dict(l=30, r=30, t=30, b=30))
fig.write_html("NoBias2D.html")
IFrame(src='./NoBias2D.html', width=800, height=700)
We use four components for the latent factor vectors. After PCA, the 1st component (the one with the largest variance) is almost the average rating of the movie. The 2nd and 3rd components represent properties of the movies. The last component is almost constant across movies, but it has a large variance across the user vectors, which implies that it represents a user's average rating. The first and last components therefore play the roles of the movie bias and the user bias, respectively. Below, I also train a model with explicit biases and two-component vectors (three parameters per movie or user); it gives a more interpretable result, though the validation loss is slightly larger.
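As a quick sanity check of this interpretation, we can correlate the first PCA component of each movie vector with that movie's average rating (a sketch reusing movie_factor and dftmdb from the cells above; a correlation close to ±1 supports the claim):
# map each movie back to its average rating via dftmdb, then correlate
# with the first PCA component of its latent vector
avg_rating = movie_factor["movieId"].map(dftmdb.set_index("Id").avgrating)
print("corr(comp_1, avg rating):", np.corrcoef(movie_factor["comp_1"], avg_rating)[0, 1])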
We can improve the model by explicitly modeling the biases of the movies and the users. (There may be some common quality that everyone agrees on, and some people may tend to give relatively low scores to all kinds of movies.) $$ \hat{r}_{mu} = V_u W_m^T + b_u + b_m $$
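For concreteness, a single prediction under this model looks as follows (a minimal sketch; predict_with_bias is a hypothetical helper, assuming the (Nusers, Ndf) and (Nmovies, Ndf) shapes returned by the training functions, with u and m the reindexed ids):
# hypothetical helper, not part of the training code:
# predicted (normalized) rating of movie m by user u under the biased model
def predict_with_bias(vec_users, vec_movies, bias_users, bias_movies, u, m):
    return vec_users[u] @ vec_movies[m] + bias_users[u] + bias_movies[m]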
outwithbias = SGD_with_bias(df_train,df_test, Nepoch = 120, learning_rate = 0.002, Ndf=2)
# curve of train loss and validation loss
sns.set_style("whitegrid")
sns.set_context("talk")
plt.plot(outwithbias[4], label = "train loss with bias")
plt.plot(outwithbias[5], label = "test loss with bias")
plt.plot(outnew3[2], label = "train loss without bias")
plt.plot(outnew3[3], label = "test loss without bias")
plt.legend()
plt.xlabel("iteration")
plt.ylabel("MSE loss")
movie_factor_bias = pd.DataFrame(outwithbias[0], columns = ["comp_1", "comp_2"])
movie_factor_bias["bias"] = outwithbias[2]
movie_factor_bias["movieId"] = movie_factor_bias.index.map(lambda x: movienew2old[x])
movie_factor_bias["title"] = movie_factor_bias["movieId"].apply(
lambda x: linkdfpick.title[linkdfpick.movieId == x].values[0])
fig = px.scatter_3d(movie_factor_bias, x="comp_1", y="comp_2", z="bias", hover_name="title",color = "bias")
fig.update_layout(margin=dict(l=30, r=30, t=30, b=30))
fig.write_html("Bias3D.html")
IFrame(src='./Bias3D.html', width=800, height=700)
fig = px.scatter(movie_factor_bias, x="comp_1", y="comp_2", hover_name="title",color = "bias")
fig.update_layout(margin=dict(l=30, r=30, t=30, b=30))
fig.write_html("Bias2D.html")
IFrame(src='./Bias2D.html', width=800, height=700)
imdfidstr = linkdfpick.imdbId.apply(lambda x: "tt{:07d}".format(x))
imdfidstr
moviedfpick = moviedf[moviedf.imdb_id.isin(imdfidstr)].copy()
# attach the MovieLens id and the learned factors to the metadata
moviedfpick["movieId"] = moviedfpick.imdb_id.apply(
    lambda x: linkdfpick.movieId[linkdfpick.imdbId == int(x[2:])].values[0])
moviedfpick["latent_vec_0"] = moviedfpick.movieId.apply(
    lambda x: movie_factor_bias.comp_1[movie_factor_bias.movieId == x].values[0])
moviedfpick["latent_vec_1"] = moviedfpick.movieId.apply(
    lambda x: movie_factor_bias.comp_2[movie_factor_bias.movieId == x].values[0])
moviedfpick["bias"] = moviedfpick.movieId.apply(
    lambda x: movie_factor_bias.bias[movie_factor_bias.movieId == x].values[0])
moviedfpick["latent_vec_nobias_0"] = moviedfpick.movieId.apply(
    lambda x: movie_factor.comp_1[movie_factor.movieId == x].values[0])
moviedfpick["latent_vec_nobias_1"] = moviedfpick.movieId.apply(
    lambda x: movie_factor.comp_2[movie_factor.movieId == x].values[0])
moviedfpick["latent_vec_nobias_2"] = moviedfpick.movieId.apply(
    lambda x: movie_factor.comp_3[movie_factor.movieId == x].values[0])
moviedfpick["latent_vec_nobias_3"] = moviedfpick.movieId.apply(
    lambda x: movie_factor.comp_4[movie_factor.movieId == x].values[0])
moviedfpick.to_csv("movie_recommendation_pick.csv", index = False)
fig = px.scatter(moviedfpick, x="latent_vec_0", y="latent_vec_1", hover_name="title",color = "bias")
fig.update_layout(margin=dict(l=30, r=30, t=30, b=30))
fig.write_html("Bias2D_2.html")
IFrame(src='./Bias2D_2.html', width=800, height=700)
fig = px.scatter(moviedfpick, x="latent_vec_nobias_1", y="latent_vec_nobias_2", hover_name="title",color = "latent_vec_nobias_0")
fig.update_layout(margin=dict(l=30, r=30, t=30, b=30))
fig.write_html("NoBias2D_2.html")
IFrame(src='./NoBias2D_2.html', width=800, height=700)
user_factor_nobias = pd.DataFrame(user_vec_pca, columns = ["comp_1", "comp_2", "comp_3", "comp_4"])
user_factor_nobias["userId"] = user_factor_nobias.index.map(lambda x: usernew2old[x])
user_factor_nobias.head()
user_factor_bias = pd.DataFrame(outwithbias[1], columns = ["comp_1", "comp_2"])
user_factor_bias["bias"] = outwithbias[3]
user_factor_bias["userId"] = user_factor_bias.index.map(lambda x: usernew2old[x])
user_factor_bias.head()
user_factor_nobias.to_csv("user_vec_nobias.csv", index = False)
user_factor_bias.to_csv("user_vec_bias.csv", index = False)
We can make some interesting plots in the vector space.
moviedfpick.head()
fig = px.scatter(moviedfpick, x="latent_vec_0", y="latent_vec_1", hover_name="title",color = "year")
fig.update_layout(margin=dict(l=30, r=30, t=30, b=30))
fig.write_html("Biasvecvsyear.html")
IFrame(src='./Biasvecvsyear.html', width=800, height=700)
fig = px.scatter(moviedfpick, x="latent_vec_0", y="latent_vec_1", hover_name="title",color = "revenue")
fig.update_layout(margin=dict(l=30, r=30, t=30, b=30))
fig.write_html("Biasvecvsrevenue.html")
IFrame(src='./Biasvecvsrevenue.html', width=800, height=700)